ClusterProfiler (BioIn Test 2)

Details & Data Used

Setup

Install if needed

# if (!require("BiocManager", quietly = TRUE))
#     install.packages("BiocManager")
# 
# BiocManager::install("clusterProfiler")
# BiocManager::install("msigdbr")
# BiocManager::install("org.Rn.eg.db")
# BiocManager::install("enrichplot")
# BiocManager::install("pathview")

Load Packages

library(tidyverse) # for dplyr, ggplot, stringr, readr etc 
library(clusterProfiler) #GSEA includes gsea, enrichplot etc.
library(org.Rn.eg.db) # Rat genome annotation package - incl. AnnotationDbi
library(msigdbr) # MSigDB gene sets
library(enrichplot) # for dotplot()
library(ggridges) # for ridgeplot()
library(pathview) # pathview() - Kegg pathway visualisation

GSEA

Prepare Protein List into Ranked Gene list

Import Protein List (MSstatsTMT comparison output)

# Set working directory to source file location

# read in protein list df from MSstatsTMT comparison
df <- read_csv(
  "input/protein_lists/2025_09_12_rp_1pspg_test_pairwise_msstats_v3.1.5_t8.csv")
## New names:
## Rows: 7746 Columns: 9
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (3): Protein, Label, issue dbl (6): ...1, log2FC, SE, DF, pvalue, adj.pvalue
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
lil checks
# lil checks
head(df) # peak
## # A tibble: 6 × 9
##    ...1 Protein    Label     log2FC     SE    DF  pvalue adj.pvalue issue
##   <dbl> <chr>      <chr>      <dbl>  <dbl> <dbl>   <dbl>      <dbl> <chr>
## 1     1 A0A096MJ11 EE vs SH  0.0145 0.0369  32.3 0.697        0.934 <NA> 
## 2     2 A0A096MJ32 EE vs SH -0.354  0.151   12   0.0365       0.536 <NA> 
## 3     3 A0A096MJ78 EE vs SH -0.0986 0.0317  57.7 0.00288      0.306 <NA> 
## 4     4 A0A096MJ94 EE vs SH -0.0598 0.182   12   0.749        0.950 <NA> 
## 5     5 A0A096MJ99 EE vs SH -0.0497 0.0329  51.4 0.138        0.696 <NA> 
## 6     6 A0A096MJA9 EE vs SH  0.0167 0.0590  46.4 0.778        0.955 <NA>
str(df) # structure | 7746 x 9
## spc_tbl_ [7,746 × 9] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ...1      : num [1:7746] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Protein   : chr [1:7746] "A0A096MJ11" "A0A096MJ32" "A0A096MJ78" "A0A096MJ94" ...
##  $ Label     : chr [1:7746] "EE vs SH" "EE vs SH" "EE vs SH" "EE vs SH" ...
##  $ log2FC    : num [1:7746] 0.0145 -0.3543 -0.0986 -0.0598 -0.0497 ...
##  $ SE        : num [1:7746] 0.0369 0.1505 0.0317 0.1825 0.0329 ...
##  $ DF        : num [1:7746] 32.3 12 57.7 12 51.4 ...
##  $ pvalue    : num [1:7746] 0.69717 0.0365 0.00288 0.74896 0.13751 ...
##  $ adj.pvalue: num [1:7746] 0.934 0.536 0.306 0.95 0.696 ...
##  $ issue     : chr [1:7746] NA NA NA NA ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ...1 = col_double(),
##   ..   Protein = col_character(),
##   ..   Label = col_character(),
##   ..   log2FC = col_double(),
##   ..   SE = col_double(),
##   ..   DF = col_double(),
##   ..   pvalue = col_double(),
##   ..   adj.pvalue = col_double(),
##   ..   issue = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
colnames(df) # column names
## [1] "...1"       "Protein"    "Label"      "log2FC"     "SE"        
## [6] "DF"         "pvalue"     "adj.pvalue" "issue"
# lil checks (bit messy)
unique(df$issue) # if = na in issue column then good. if not exclude
## [1] NA                    "oneConditionMissing"
df %>% filter(str_detect(Protein, "Cont_")) # rows with contaminants - to exlude
## # A tibble: 21 × 9
##     ...1 Protein     Label      log2FC     SE    DF pvalue adj.pvalue issue
##    <dbl> <chr>       <chr>       <dbl>  <dbl> <dbl>  <dbl>      <dbl> <chr>
##  1  1875 Cont_O77727 EE vs SH  0.112   0.208   12   0.598       0.904 <NA> 
##  2  1876 Cont_P00761 EE vs SH -0.0316  0.0486  51.3 0.519       0.882 <NA> 
##  3  1877 Cont_P00883 EE vs SH -0.0477  0.0545  12   0.398       0.845 <NA> 
##  4  1878 Cont_P02769 EE vs SH -0.102   0.134   54.5 0.449       0.864 <NA> 
##  5  1879 Cont_P04264 EE vs SH -0.336   0.213   51.4 0.121       0.682 <NA> 
##  6  1880 Cont_P09870 EE vs SH  0.0783  0.173   12   0.659       0.921 <NA> 
##  7  1881 Cont_P13645 EE vs SH -0.386   0.180   53.6 0.0369      0.538 <NA> 
##  8  1882 Cont_P19012 EE vs SH -0.0800  0.108   12   0.474       0.871 <NA> 
##  9  1883 Cont_P30879 EE vs SH  0.0168  0.0297  40.0 0.576       0.897 <NA> 
## 10  1884 Cont_P34955 EE vs SH -0.00468 0.0900  12   0.959       0.990 <NA> 
## # ℹ 11 more rows
df %>% filter(str_detect(log2FC, "Inf")) # rows with Infinity /no log fold - to exclude
## # A tibble: 1 × 9
##    ...1 Protein    Label    log2FC    SE    DF pvalue adj.pvalue issue          
##   <dbl> <chr>      <chr>     <dbl> <dbl> <dbl>  <dbl>      <dbl> <chr>          
## 1   134 A0A0G2JYB7 EE vs SH    Inf    NA    NA     NA         NA oneConditionMi…
df %>% filter(is.na(pvalue)) # same observation as the "Inf" in log2FC
## # A tibble: 1 × 9
##    ...1 Protein    Label    log2FC    SE    DF pvalue adj.pvalue issue          
##   <dbl> <chr>      <chr>     <dbl> <dbl> <dbl>  <dbl>      <dbl> <chr>          
## 1   134 A0A0G2JYB7 EE vs SH    Inf    NA    NA     NA         NA oneConditionMi…
df %>% filter(is.na(adj.pvalue)) # same observation as the "Inf" in log2FC
## # A tibble: 1 × 9
##    ...1 Protein    Label    log2FC    SE    DF pvalue adj.pvalue issue          
##   <dbl> <chr>      <chr>     <dbl> <dbl> <dbl>  <dbl>      <dbl> <chr>          
## 1   134 A0A0G2JYB7 EE vs SH    Inf    NA    NA     NA         NA oneConditionMi…

Clean up df & rename

protein_comp <- df %>%
  filter(is.na(issue)) %>% # keeps rows with issue == na (so comparisons that are ok)
  filter(!str_detect(Protein, "Cont_")) %>% # remove contaminant proteins
  filter(!str_detect(log2FC, "Inf")) %>% # remove infinity/missing fold change values
  dplyr::select(Protein:adj.pvalue) # rm weird ...1 col & issue col, skip this? 

# check 
head(protein_comp) 
## # A tibble: 6 × 7
##   Protein    Label     log2FC     SE    DF  pvalue adj.pvalue
##   <chr>      <chr>      <dbl>  <dbl> <dbl>   <dbl>      <dbl>
## 1 A0A096MJ11 EE vs SH  0.0145 0.0369  32.3 0.697        0.934
## 2 A0A096MJ32 EE vs SH -0.354  0.151   12   0.0365       0.536
## 3 A0A096MJ78 EE vs SH -0.0986 0.0317  57.7 0.00288      0.306
## 4 A0A096MJ94 EE vs SH -0.0598 0.182   12   0.749        0.950
## 5 A0A096MJ99 EE vs SH -0.0497 0.0329  51.4 0.138        0.696
## 6 A0A096MJA9 EE vs SH  0.0167 0.0590  46.4 0.778        0.955
str(protein_comp) # 7724 x 7
## tibble [7,724 × 7] (S3: tbl_df/tbl/data.frame)
##  $ Protein   : chr [1:7724] "A0A096MJ11" "A0A096MJ32" "A0A096MJ78" "A0A096MJ94" ...
##  $ Label     : chr [1:7724] "EE vs SH" "EE vs SH" "EE vs SH" "EE vs SH" ...
##  $ log2FC    : num [1:7724] 0.0145 -0.3543 -0.0986 -0.0598 -0.0497 ...
##  $ SE        : num [1:7724] 0.0369 0.1505 0.0317 0.1825 0.0329 ...
##  $ DF        : num [1:7724] 32.3 12 57.7 12 51.4 ...
##  $ pvalue    : num [1:7724] 0.69717 0.0365 0.00288 0.74896 0.13751 ...
##  $ adj.pvalue: num [1:7724] 0.934 0.536 0.306 0.95 0.696 ...
colnames(protein_comp)
## [1] "Protein"    "Label"      "log2FC"     "SE"         "DF"        
## [6] "pvalue"     "adj.pvalue"

Convert UniProt Accessions into EntrezID using bitr()

  • bitr: Biological ID Translator

  • Note Entrez id is a central gene identifier - so want to map uniprot accession with gene ID which then can be mapped to other IDs.

    • Side note - DO NOT ANNOTATE WITH MULTIPLE TOTYPE e.g. toType = c(“ENTREZID”, “SYMBOL”, “GENENAME”), will result in duplication errors since various gene names or symbols get mapped on?
keytypes(org.Rn.eg.db) # key types available
##  [1] "ACCNUM"       "ALIAS"        "ENSEMBL"      "ENSEMBLPROT"  "ENSEMBLTRANS"
##  [6] "ENTREZID"     "ENZYME"       "EVIDENCE"     "EVIDENCEALL"  "GENENAME"    
## [11] "GENETYPE"     "GO"           "GOALL"        "IPI"          "ONTOLOGY"    
## [16] "ONTOLOGYALL"  "PATH"         "PFAM"         "PMID"         "PROSITE"     
## [21] "REFSEQ"       "SYMBOL"       "UNIPROT"
# 7158/7724 mapped (8.05% missing)
entrezID <- bitr(protein_comp$Protein, # input df
             fromType = "UNIPROT", # input type
             toType = "ENTREZID", # output type
             OrgDb = org.Rn.eg.db) # annotation db
## 'select()' returned 1:many mapping between keys and columns
## Warning in bitr(protein_comp$Protein, fromType = "UNIPROT", toType =
## "ENTREZID", : 8.05% of input gene IDs are fail to map...
# UNIPROT -> ENTREZID - 8.05% of input gene failed to map
# UNIPROT -> ENSEMBL 12.91% of input gene IDs are fail to map...
# UNIPROT -> REFSEQ - 8.05% of input gene IDs are fail to map...
# UNIPROT -> SYMBOL -  8.05% of input gene IDs are fail to map...

Merge EntrezID onto protein list

protein_genes <- merge(entrezID, protein_comp,
                       by.x = "UNIPROT",
                       by.y = "Protein")

head(protein_genes)
##      UNIPROT ENTREZID    Label      log2FC         SE       DF      pvalue
## 1 A0A096MJ11   315084 EE vs SH  0.01450424 0.03693997 32.25947 0.697165337
## 2 A0A096MJ32   302612 EE vs SH -0.35425998 0.15054311 12.00000 0.036498920
## 3 A0A096MJ78   308478 EE vs SH -0.09856990 0.03166410 57.67314 0.002881095
## 4 A0A096MJ94   362426 EE vs SH -0.05975939 0.18248997 12.00000 0.748957106
## 5 A0A096MJ99   364033 EE vs SH -0.04965287 0.03291150 51.35170 0.137509534
## 6 A0A096MJA9   312981 EE vs SH  0.01668589 0.05896103 46.41957 0.778436130
##   adj.pvalue
## 1  0.9336827
## 2  0.5364025
## 3  0.3056723
## 4  0.9497256
## 5  0.6963620
## 6  0.9545115
dim(protein_genes)
## [1] 7158    8
#write out?

Prepare Ranked Gene List

# use sign(df$log2fc)*(-log10(df$pval)) + BSS & PNNL

# using 
geneList_df <- protein_genes %>%
  mutate(ranking = sign(protein_genes$log2FC)*(-log10(protein_genes$pvalue))) %>%
  group_by(ENTREZID) %>%
  summarise(ranking = mean(ranking, na.rm = TRUE)) %>%
  arrange(-ranking)

# can export 

dim(geneList_df) # 7153 x 2 - same as prev test! 
## [1] 7153    2
# pvalue & log2FC  min = -7.6191 max 4.559
# adj.pvalue * log2FC = max 1.3938 & min -3.7302

# ~ ~ ~ ~ ~ ~ ~ 
# convert into a named vector to supply to fgsea()
geneList <- tibble::deframe(geneList_df) 

head(geneList)
##   288080   689954    29237   191595   299282   683788 
## 4.559848 4.437793 4.129179 3.828775 3.765140 3.762137
tail(geneList)
##    360814     25379    363016    361663    114488     29637 
## -4.306857 -4.335003 -4.474002 -4.556742 -4.753525 -7.619261
plot(geneList)

sum(is.na(geneList)) # 0 | without extra line (not sure why previous version had NA)
## [1] 0

Running GSEA via clusterProfiler

msigbdr

# # msigdbr helper functions
# msigdbr_species() # 20 species
# msigdbr_collections() # 33730584 bytes?
# unique(msigdbr_df$db_version) # check ver
# citation("msigdbr") # how to cite

Hallmark Collection (H)

# Preps for fgsea()  

# # example
# # Get CGP (chemical and genetic perturbations) gene sets with genes mapped to rat
# orthologs gs <- msigdbr(species = "Rattus norvegicus",
#                         collection = "C2",
#                         subcollection = "CGP")

# pull in hallmark genesets from msigdbr, maps rat orthologs to human geneset 
hallmark_gs_df <- msigdbr(species = "Rattus norvegicus", # my species
                             collection = "H") # Hallmark collection
                             
head(hallmark_gs_df) 
## # A tibble: 6 × 23
##   gene_symbol ncbi_gene ensembl_gene db_gene_symbol db_ncbi_gene db_ensembl_gene
##   <chr>       <chr>     <chr>        <chr>          <chr>        <chr>          
## 1 Abca1       313210    ENSRNOG0000… ABCA1          19           ENSG00000165029
## 2 Abcb8       362302    ENSRNOG0000… ABCB8          11194        ENSG00000197150
## 3 Acaa2       170465    ENSRNOG0000… ACAA2          10449        ENSG00000167315
## 4 Acadl       25287     ENSRNOG0000… ACADL          33           ENSG00000115361
## 5 Acadm       24158     ENSRNOG0000… ACADM          34           ENSG00000117054
## 6 Acads       64304     ENSRNOG0000… ACADS          35           ENSG00000122971
## # ℹ 17 more variables: source_gene <chr>, gs_id <chr>, gs_name <chr>,
## #   gs_collection <chr>, gs_subcollection <chr>, gs_collection_name <chr>,
## #   gs_description <chr>, gs_source_species <chr>, gs_pmid <chr>,
## #   gs_geoid <chr>, gs_exact_source <chr>, gs_url <chr>, db_version <chr>,
## #   db_target_species <chr>, ortholog_taxon_id <int>, ortholog_sources <chr>,
## #   num_ortholog_sources <dbl>
dim(hallmark_gs_df) # 7294 x 23
## [1] 7294   23
# ~ ~ ~ ~ ~ ~ ~ 
# Convert in 2 name list format required for fgsea()
# Splits in separate lists containing Entrez grouped by the geneset name (gs_name)
# ENTREZID is ncbi_gene?
hallmark_genesets <-  hallmark_gs_df %>%
  split(x = .$ncbi_gene, # containing - df/vector
        f = .$gs_name) # file - factor

#str(hallmark_genesets) # list of 50
# Preps TERM2GENE for GSEA() | 1st col = term_id/geneset & 2nd col = mapped gene
msig_h <- msigdbr(species = "Rattus norvegicus",
                  collection = "H") %>%
 dplyr::select(gs_name, ncbi_gene) # get only geneset name & corresponding ENTREZID

head(msig_h)
## # A tibble: 6 × 2
##   gs_name               ncbi_gene
##   <chr>                 <chr>    
## 1 HALLMARK_ADIPOGENESIS 313210   
## 2 HALLMARK_ADIPOGENESIS 362302   
## 3 HALLMARK_ADIPOGENESIS 170465   
## 4 HALLMARK_ADIPOGENESIS 25287    
## 5 HALLMARK_ADIPOGENESIS 24158    
## 6 HALLMARK_ADIPOGENESIS 64304
dim(msig_h) #  7294 x 2
## [1] 7294    2

Running gsea with GSEA()

# using GSEA() from clusterProfiler

# uses fgseaMultilevel() function? | auto 
GSEA_hallmark <- GSEA(geneList = geneList, # ranked gene list
                      TERM2GENE = msig_h, # genesets
                      by = 'fgsea' # method - fgsea or DOSE
                      )
## using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
## preparing geneSet collections...
## GSEA analysis...
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.73% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
## Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
## minSize, : For some of the pathways the P-values were likely overestimated. For
## such pathways log2err is set to NA.
## Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
## minSize, : For some pathways, in reality P-values are less than 1e-10. You can
## set the `eps` argument to zero for better estimation.
## leading edge analysis...
## done...
# Output has lots of extra info! results + genesets, Genelist, input params! -> save rda
# can make into tibble for easy viewing 
GSEA_hallmark_tibble <- as_tibble(GSEA_hallmark)

Plots

enrichplot::dotplot(GSEA_hallmark,
                    showCategory = 15,
                    title = "Housing - EE vs. SH (t8 - t2)")

dotplot(GSEA_hallmark,
        split = ".sign", # split by sign - positive vs negative enrichment score
        title = "Housing - EE vs. SH (t8 - t2) - split by sign",
        font.size = 11) +  
  facet_grid(.~.sign) # split plots by sign

GSEA plot using gseaplot() from enrichplot

  • Look into making into a function!
# GSEA plot - positive - most sig [1] - Hallmark_oxidative_phosphorylation
gseaplot(GSEA_hallmark, 
         geneSetID = 1, # 1st geneset is 1
         by = "all", # running score or position
         title = GSEA_hallmark$Description[1])

# GSEA plot - positive - most sig [2] - Hallmark_MYC_targets_v1
gseaplot(GSEA_hallmark, 
         geneSetID = 2, # 2nd geneset is 2
         by = "all", # running score or position
         title = GSEA_hallmark$Description[2])

Ridgeplot

ridgeplot(GSEA_hallmark) + labs(x = "enrichment distribution")
## Picking joint bandwidth of 0.236

C5 Ontology gene sets (GO)

# Preps TERM2GENE for GSEA() | 1st col = term_id/geneset & 2nd col = mapped gene
msig_C5_CC <- msigdbr(species = "Rattus norvegicus",
                      collection = "C5", # C5 - ontology set
                      subcollection = "CC") %>% # GO cellular component
  dplyr::select(gs_name, ncbi_gene)

dim(msig_C5_CC) # 98011 x 2 for C5 CC!!!
## [1] 98011     2
# using GSEA() from clusterProfiler
# uses fgseaMultilevel() function? | auto 
GSEA_msig_C5_CC <- GSEA(geneList = geneList, # ranked gene list
                      TERM2GENE = msig_C5_CC, # genesets
                      #organism = "rno",
                      by = 'fgsea') # method - fgsea or DOSE
## using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
## preparing geneSet collections...
## GSEA analysis...
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.73% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
## Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
## minSize, : There were 9 pathways for which P-values were not calculated
## properly due to unbalanced (positive and negative) gene-level statistic values.
## For such pathways pval, padj, NES, log2err are set to NA. You can try to
## increase the value of the argument nPermSimple (for example set it nPermSimple
## = 10000)
## Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
## minSize, : For some of the pathways the P-values were likely overestimated. For
## such pathways log2err is set to NA.
## Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
## minSize, : For some pathways, in reality P-values are less than 1e-10. You can
## set the `eps` argument to zero for better estimation.
## leading edge analysis...
## done...
# had to restart R to fix error | worked after - see warnings.                      
                      

# Output has lots of extra info! results + genesets, Genelist, input params! -> save rda
# can make into tibble for easy viewing 
GSEA_msig_C5_CC_tibble <- as_tibble(GSEA_msig_C5_CC)
# 102 significant beyond adj pvalue 0.05 
# neuronal & synaptic ones activated in EE. phew! 
dotplot(GSEA_msig_C5_CC,
        split = ".sign", # split by pos vs neg enrichment score
        showCategory = 102,
        font.size = 11,
        label_format = 55,
        title = "Housing - EE vs. SH (t8 - t2) - GSEA_msig_C5_CC") +
  facet_grid(.~.sign) # split plots by sign

# simplify
sim_msig_go_cc <- pairwise_termsim(GSEA_msig_C5_CC)

NOTE TO SELF - so many other curated genesets available on MSigDB!!! Look into!

GO via gseGO

gseGO CC

gseGO_cc <- gseGO(geneList = geneList, 
             ont ="CC", # cell component
             keyType = "ENTREZID",
             OrgDb = org.Rn.eg.db, # if you unset it uses bg based on provide geneList?
             by = "fgsea",
             minGSSize = 10, 
             maxGSSize = 500, 
             pvalueCutoff = 0.05, 
             verbose = TRUE, 
             pAdjustMethod = "BH")
## using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
## preparing geneSet collections...
## GSEA analysis...
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.73% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
## Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
## minSize, : There were 16 pathways for which P-values were not calculated
## properly due to unbalanced (positive and negative) gene-level statistic values.
## For such pathways pval, padj, NES, log2err are set to NA. You can try to
## increase the value of the argument nPermSimple (for example set it nPermSimple
## = 10000)
## Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
## minSize, : For some of the pathways the P-values were likely overestimated. For
## such pathways log2err is set to NA.
## Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
## minSize, : For some pathways, in reality P-values are less than 1e-10. You can
## set the `eps` argument to zero for better estimation.
## leading edge analysis...
## done...
# can make into tibble for easy viewing 
gseGO_cc_tibble <- as_tibble(gseGO_cc) 
# 102 significant beyond adj pvalue 0.05 
enrichplot::dotplot(gseGO_cc,
                    split  = ".sign", # split by + vs - enrirchment score
                    showCategory = 102,
                    font.size = 11,
                    label_format = 55,
                    title = "Housing - EE vs. SH (t8 - t2) - gseGO_CC"
                    ) +
  facet_grid(.~.sign) # split plots by sign

# 6.6 Visualize enriched GO terms as a directed acyclic graph
# goplot requires ggarchery | plot induced GO DAG of significant terms
goplot(gseGO_cc,
       showCategory = 25, # default 10
       layout = "sugiyama") # default
## Warning: ggrepel: 5 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

# Gene-Concept Network with cnetplot()
# 15.4 Heatmap-like functional classification
#15.5 Tree plot
# The treeplot() function performs hierarchical clustering of enriched terms. 
# Relies on the pairwise similarities of the enriched terms calc by the pairwise_termsim() func, which by default using Jaccard’s similarity index (JC).

gseGO_cc_pwts <- pairwise_termsim(gseGO_cc)

treeplot(gseGO_cc_pwts,
         hclust_method = "average", # see hclust () documentation on methods
         showCategory = 70, # default 30
         nCluster = 10, # default 5 | parent term changes a lot & weird assign at times
         )
## Warning in treeplot.enrichResult(x, ...): Use 'cluster.params = list(method = your_value)' instead of 'hclust_method'.
##  The hclust_method parameter will be removed in the next version.
## Warning in treeplot.enrichResult(x, ...): Use 'cluster.params = list(n = your_value)' instead of 'nCluster'.
##  The nCluster parameter will be removed in the next version.

# works but unstable assignment - look into
# 15.6 Enrichment Map 
# also use gseGO_cc_pwts <- pairwise_termsim(gseGO_cc) from above

emapplot(gseGO_cc_pwts, 
         showCategory = 30, # default 30
         # cex_category = 1.5, # can be used to resize nodes | not in ?emapplot?
         # layout = "kk" # change layout | not in ?emapplot? nvm old doesn't accept?
         )

# simplfy
sim_gseGO_cc <- simplify(gseGO_cc_pwts,
                         cutoff=0.7,
                         by="p.adjust",
                         select_fun=min)

# plot simplified
emapplot(sim_gseGO_cc)

NOTE TO SELF - Look at the other plots & what are appropriate presentation guidelines

gseGO MF

gseGO_mf <- gseGO(geneList = geneList, 
             ont ="MF", # molecular function
             keyType = "ENTREZID",
             OrgDb = org.Rn.eg.db, # if you unset it uses bg based on provide geneList?
             by = "fgsea",
             minGSSize = 10, 
             maxGSSize = 500, 
             pvalueCutoff = 0.05, 
             verbose = TRUE, 
             pAdjustMethod = "BH")
## using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
## preparing geneSet collections...
## GSEA analysis...
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.73% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
## Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
## minSize, : There were 4 pathways for which P-values were not calculated
## properly due to unbalanced (positive and negative) gene-level statistic values.
## For such pathways pval, padj, NES, log2err are set to NA. You can try to
## increase the value of the argument nPermSimple (for example set it nPermSimple
## = 10000)
## Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
## minSize, : For some of the pathways the P-values were likely overestimated. For
## such pathways log2err is set to NA.
## Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
## minSize, : For some pathways, in reality P-values are less than 1e-10. You can
## set the `eps` argument to zero for better estimation.
## leading edge analysis...
## done...
# can make into tibble for easy viewing 
gseGO_mf_tibble <- as_tibble(gseGO_mf) 
# 63 significant beyond adj pvalue 0.05 
enrichplot::dotplot(gseGO_mf,
                    split  = ".sign", # split by + vs - enrirchment score
                    showCategory = 65,
                    font.size = 11,
                    label_format = 55,
                    title = "Housing - EE vs. SH (t8 - t2) - gseGO_mf"
                    ) +
  facet_grid(.~.sign) # split plots by sign

#15.5 Tree plot
gseGO_mf_pwts <- pairwise_termsim(gseGO_mf)

treeplot(gseGO_mf_pwts,
         hclust_method = "average", # see hclust () documentation on methods
         showCategory = 65, # default 30
         nCluster = 20) # default 5 | parent term changes a lot & weird assign at times
## Warning in treeplot.enrichResult(x, ...): Use 'cluster.params = list(method = your_value)' instead of 'hclust_method'.
##  The hclust_method parameter will be removed in the next version.
## Warning in treeplot.enrichResult(x, ...): Use 'cluster.params = list(n = your_value)' instead of 'nCluster'.
##  The nCluster parameter will be removed in the next version.

# works but unstable assignment - look into
sim_gseGO_mf <- simplify(gseGO_mf_pwts,
                         cutoff=0.7,
                         by="p.adjust",
                         select_fun=min)
# plot simplified
emapplot(sim_gseGO_mf)

goplot(gseGO_mf,
       showCategory = 25, # default 10
       layout = "sugiyama") # default
## Warning: ggrepel: 6 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

gseGO BP

gseGO_bp <- gseGO(geneList = geneList, 
             ont ="BP", # Biological process
             keyType = "ENTREZID",
             OrgDb = org.Rn.eg.db, # if you unset it uses bg based on provide geneList?
             by = "fgsea",
             minGSSize = 10, 
             maxGSSize = 500, 
             pvalueCutoff = 0.05, 
             verbose = TRUE, 
             pAdjustMethod = "BH")
## using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
## preparing geneSet collections...
## GSEA analysis...
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.73% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
## Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
## minSize, : There were 18 pathways for which P-values were not calculated
## properly due to unbalanced (positive and negative) gene-level statistic values.
## For such pathways pval, padj, NES, log2err are set to NA. You can try to
## increase the value of the argument nPermSimple (for example set it nPermSimple
## = 10000)
## Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
## minSize, : For some of the pathways the P-values were likely overestimated. For
## such pathways log2err is set to NA.
## Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
## minSize, : For some pathways, in reality P-values are less than 1e-10. You can
## set the `eps` argument to zero for better estimation.
## leading edge analysis...
## done...
# can make into tibble for easy viewing 
gseGO_bp_tibble <- as_tibble(gseGO_bp) 
# 176 significant beyond adj pvalue 0.05 
enrichplot::dotplot(gseGO_bp,
                    split  = ".sign", # split by + vs - enrirchment score
                    showCategory = 176,
                    font.size = 11,
                    label_format = 80,
                    title = "Housing - EE vs. SH (t8 - t2) - gseGO_bp"
                    ) +
  facet_grid(.~.sign) # split plots by sign

#15.5 Tree plot
gseGO_bp_pwts <- pairwise_termsim(gseGO_bp)

treeplot(gseGO_bp_pwts,
         hclust_method = "average", # see hclust () documentation on methods
         showCategory = 176, # default 30
         nCluster = 20, # default 5 | parent term changes a lot & weird assign at times
         )
## Warning in treeplot.enrichResult(x, ...): Use 'cluster.params = list(method = your_value)' instead of 'hclust_method'.
##  The hclust_method parameter will be removed in the next version.
## Warning in treeplot.enrichResult(x, ...): Use 'cluster.params = list(n = your_value)' instead of 'nCluster'.
##  The nCluster parameter will be removed in the next version.

# works but unstable assignment - look into
sim_gseGO_bp <- simplify(gseGO_bp_pwts,
                         cutoff=0.7,
                         by="p.adjust",
                         select_fun=min)
# plot simplified
emapplot(sim_gseGO_cc)

~ ~ ~ ~ ~ ~ ~ ~ ~

KEGG via gseKEGG

# look at rat info kegg
rat <- search_kegg_organism('rno', by='kegg_code')
head(rat)
##    kegg_code   scientific_name common_name
## 33       rno Rattus norvegicus         rat
gseKEGG_r <- gseKEGG(geneList = geneList, 
                     organism = 'rno', #RAT
                     keyType = "kegg", 
                     minGSSize = 10, # min gene set size
                     maxGSSize = 500, # max gene set size
                     pvalueCutoff = 0.05, # pvalue cutoff
                     pAdjustMethod = 'BH', #BejaminiHoc Correciton
                     verbose = TRUE) # print message or not
## Reading KEGG annotation online: "https://rest.kegg.jp/link/rno/pathway"...
## Reading KEGG annotation online: "https://rest.kegg.jp/list/pathway/rno"...
## using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
## preparing geneSet collections...
## GSEA analysis...
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.73% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
## Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
## minSize, : There were 2 pathways for which P-values were not calculated
## properly due to unbalanced (positive and negative) gene-level statistic values.
## For such pathways pval, padj, NES, log2err are set to NA. You can try to
## increase the value of the argument nPermSimple (for example set it nPermSimple
## = 10000)
## Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
## minSize, : For some of the pathways the P-values were likely overestimated. For
## such pathways log2err is set to NA.
## Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
## minSize, : For some pathways, in reality P-values are less than 1e-10. You can
## set the `eps` argument to zero for better estimation.
## leading edge analysis...
## done...
gse_kegg_tibble <- as_tibble(gseKEGG_r) # 26 sig or 23?? (0.05)| 32 if adjp = 0.1
enrichplot::dotplot(gseKEGG_r,
                    split = ".sign",  # split by + vs - enrirchment score
                    showCategory = 30,
                    title = "Housing - EE vs. SH (t8 - t2) - gseKEGG_r") +
  facet_grid(.~.sign) # split plot by seign

Visualise enriched KEGG pathways via pathview() & browseKEGG()

Glutamatergic synapse - Rattus norvegicus (rat)

# view browseKEGG from clusterProfiler
# launches to web when function run! 
# browseKEGG(gseKEGG_r,
#            pathID = 'rno04724') # glutamtergic synapse rat

KEGG - Organismal Systems > Nervous System (10 pathways)

# via pathview::pathview()

# rno04724 <- pathview(gene.data = geneList,
#                      pathway.id = 'rno04724', # Glutamatergic synapse - Rat
#                      species = 'rno',
#                      limit = list(gene=max(abs(geneList)), cpd=1))

# writes out png & xml files to source file location
# look into how to plot & also save as PDF
# rno04723 - Retrograde endocannabinoid signaling

# rno04723 <- pathview(gene.data = geneList,
#                      pathway.id = 'rno04723',
#                      species = 'rno',
#                      limit = list(gene=max(abs(geneList)), cpd=1))

# gives error ???
# rno05016 - Huntington disease | runs okay!
# rno05016 <- pathview(gene.data = geneList,
#                      pathway.id = 'rno05016',
#                      species = 'rno',
#                      limit = list(gene=max(abs(geneList)), cpd=1))
# rno04722 - neurotrophin_signaling
# rno04722 <- pathview(gene.data = geneList,
#                      pathway.id = 'rno04722',
#                      species = 'rno',
#                      limit = list(gene=max(abs(geneList)), cpd=1))
# rno04720 - Long Term Potentiation (LTP)
# rno04720 <- pathview(gene.data = geneList,
#                      pathway.id = 'rno04720',
#                      species = 'rno',
#                      limit = list(gene=max(abs(geneList)), cpd=1))
# rno05012 - Parkinsons
# rno05012 <- pathview(gene.data = geneList,
#                      pathway.id = 'rno05012',
#                      species = 'rno',
#                      limit = list(gene=max(abs(geneList)), cpd=1))
# rno04082 - Neuroactive ligand signaling | nice holistic view of many neuroligand paths
# rno04082 <- pathview(gene.data = geneList,
#                      pathway.id = 'rno04082',
#                      species = 'rno',
#                      limit = list(gene=max(abs(geneList)), cpd=1))
# rno05033 - Nicotine Addiction | not much just ampa & gaba
# rno05033 <- pathview(gene.data = geneList,
#                      pathway.id = 'rno05033',
#                      species = 'rno',
#                      limit = list(gene=max(abs(geneList)), cpd=1))

~ ~ ~

# rno04721 - Synaptic_vesicle_cycle | most up regulated in EE 
# rno04721 <- pathview(gene.data = geneList,
#                      pathway.id = 'rno04721',
#                      species = 'rno',
#                      limit = list(gene=max(abs(geneList)), cpd=1))
# rno04727 - GABAergic_synapse_ 
# rno04727 <- pathview(gene.data = geneList,
#                      pathway.id = 'rno04727',
#                      species = 'rno',
#                      limit = list(gene=max(abs(geneList)), cpd=1))
# rno04728 - Dompaminergic_synapse_
# rno04728 <- pathview(gene.data = geneList,
#                      pathway.id = 'rno04728',
#                      species = 'rno',
#                      limit = list(gene=max(abs(geneList)), cpd=1))
# rno04726 - Serotonergic_synapse_
# rno04726 <- pathview(gene.data = geneList,
#                      pathway.id = 'rno04726',
#                      species = 'rno',
#                      limit = list(gene=max(abs(geneList)), cpd=1))
# rno04725 - Cholinergic_synapse_ | Interesting seems 2 be mediating synaptic plasticity
# rno04725 <- pathview(gene.data = geneList,
#                      pathway.id = 'rno04725',
#                      species = 'rno',
#                      limit = list(gene=max(abs(geneList)), cpd=1))
# rno04730 - LTD_ | cerebellum pathview? | otherwise interesting
# rno04730 <- pathview(gene.data = geneList,
#                      pathway.id = 'rno04730',
#                      species = 'rno',
#                      limit = list(gene=max(abs(geneList)), cpd=1))

Environmental Information Processing > Signal Transduction

# rno04010 - MAPK_signaling_ | Interesting - cell proliferation & differentiation
# rno04010 <- pathview(gene.data = geneList,
#                      pathway.id = 'rno04010',
#                      species = 'rno',
#                      limit = list(gene=max(abs(geneList)), cpd=1))
# rno04014 - Ras_signaling_
# rno04014 <- pathview(gene.data = geneList,
#                      pathway.id = 'rno04014',
#                      species = 'rno',
#                      limit = list(gene=max(abs(geneList)), cpd=1))

Most significant Pathways

1 - Oxidative Phosphorylation (rno00190) - Metabolism >Energy Metabolism > Ox Pho

# rno00190 - Oxidative_phosphorylation_ | several highly upregulated
# rno00190 <- pathview(gene.data = geneList,
#                      pathway.id = 'rno00190',
#                      species = 'rno',
#                      limit = list(gene=max(abs(geneList)), cpd=1))

2 rno05415 Diabetic_cardiomyopathy - Human Diseases; Cardiovascular Disease

# rno05415 - Diabetic_cardiomyopathy_ | upreg @ mitocondrial - what does this mean?
# rno05415 <- pathview(gene.data = geneList,
#                      pathway.id = 'rno05415',
#                      species = 'rno',
#                      limit = list(gene=max(abs(geneList)), cpd=1))

3 - Non-alcoholic fatty liver disease rno04932 - Endocrine & metabolic disease

# # rno04932 - Non-alcoholic_fatty_liver_d_ | driven by cx1,3,4 mito resp chain
# rno04932 <- pathview(gene.data = geneList,
#                      pathway.id = 'rno04932',
#                      species = 'rno',
#                      limit = list(gene=max(abs(geneList)), cpd=1))

4 - Thermogenesis_ (rno04714) - adipose tissue & keep warm | fat SH rats & skinny EE

# rno04714 - Thermogenesis_ | mitochondrial + others - look into
# rno04714 <- pathview(gene.data = geneList,
#                      pathway.id = 'rno04714',
#                      species = 'rno',
#                      limit = list(gene=max(abs(geneList)), cpd=1))

9 - Pathways of neurodegeneration - multiple diseases (rno05022) (neurodegen disea

# rno05022 - Pathways_of_neurodegeneration_multi_ |interesting/confusing
# rno05022 <- pathview(gene.data = geneList,
#                      pathway.id = 'rno05022',
#                      species = 'rno',
#                      limit = list(gene=max(abs(geneList)), cpd=1))

Not enriched but interesting

# #rno04081 - Hormone_signaling_ | various guanine nucleotide binding pro & PKA down
# # and only SST (somatostatin) & Opioid (Penk) Up | also only brain specific up
# rno04081 <- pathview(gene.data = geneList,
#                      pathway.id = 'rno04081',
#                      species = 'rno',
#                      limit = list(gene=max(abs(geneList)), cpd=1))
# rno04080 - Neuroactive_ligand_receptor_interaction_ | not many but interesting - look
# rno04080 <- pathview(gene.data = geneList,
#                      pathway.id = 'rno04080',
#                      species = 'rno',
#                      limit = list(gene=max(abs(geneList)), cpd=1))
# rno04060 - Cytokine_cytokine_receptor_interaction_ | no up or down
# rno04060 <- pathview(gene.data = geneList,
#                      pathway.id = 'rno04060',
#                      species = 'rno',
#                      limit = list(gene=max(abs(geneList)), cpd=1))
# rno04512 - ECM_receptor_interaction_ | all down 
# rno04512 <- pathview(gene.data = geneList,
#                      pathway.id = 'rno04512',
#                      species = 'rno',
#                      limit = list(gene=max(abs(geneList)), cpd=1))
# rno04514 - Cell_adhesion_molecules_ | no up or down except CXADR down
# rno04514 <- pathview(gene.data = geneList,
#                      pathway.id = 'rno04514',
#                      species = 'rno',
#                      limit = list(gene=max(abs(geneList)), cpd=1))
# rno04360 - Axon_guidance_ | most down except Ras
# rno04360 <- pathview(gene.data = geneList,
#                      pathway.id = 'rno04360',
#                      species = 'rno',
#                      limit = list(gene=max(abs(geneList)), cpd=1))

~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~

KEGG Module GSEA via gseMKEGG()

gseMKEGG_r <- gseMKEGG(geneList = geneList,
                       organism = 'rno',
                       pvalueCutoff = 1)
## Reading KEGG annotation online: "https://rest.kegg.jp/link/rno/module"...
## Reading KEGG annotation online: "https://rest.kegg.jp/list/module"...
## using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
## preparing geneSet collections...
## GSEA analysis...
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.73% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
## leading edge analysis...
## done...
## --> Not a KEGG enrichment result
head(gseMKEGG_r)
##            ID
## M00146 M00146
## M00158 M00158
## M00154 M00154
## M00009 M00009
## M00147 M00147
## M00011 M00011
##                                                                   Description
## M00146                     NADH dehydrogenase (ubiquinone) 1 alpha subcomplex
## M00158                                              F-type ATPase, eukaryotes
## M00154                                                   Cytochrome c oxidase
## M00009                                 Citrate cycle (TCA cycle, Krebs cycle)
## M00147                      NADH dehydrogenase (ubiquinone) 1 beta subcomplex
## M00011 Citrate cycle, second carbon oxidation, 2-oxoglutarate => oxaloacetate
##        setSize enrichmentScore      NES       pvalue     p.adjust       qvalue
## M00146      12       0.8917876 2.424023 6.069907e-07 1.031884e-05 4.472563e-06
## M00158      14       0.8231009 2.324812 8.226993e-06 6.992944e-05 3.030998e-05
## M00154      12       0.8503936 2.311508 1.236428e-05 7.006423e-05 3.036840e-05
## M00009      21       0.7081324 2.200668 6.558425e-05 2.787330e-04 1.208131e-04
## M00147      10       0.8376506 2.172511 1.404557e-04 4.775495e-04 2.069874e-04
## M00011      13       0.7771340 2.168138 1.833869e-04 5.195963e-04 2.252120e-04
##        rank                   leading_edge
## M00146  697 tags=83%, list=10%, signal=75%
## M00158  902 tags=71%, list=13%, signal=63%
## M00154  595  tags=83%, list=8%, signal=77%
## M00009  641  tags=57%, list=9%, signal=52%
## M00147 1054 tags=90%, list=15%, signal=77%
## M00011  641  tags=54%, list=9%, signal=49%
##                                                                        core_enrichment
## M00146        296658/293453/301123/678759/299739/362440/315167/100911483/681024/291660
## M00158             116550/171374/65262/26196/192241/300677/641434/171375/140608/245965
## M00154              94194/120103152/303393/26198/54322/29507/252934/298762/29445/25282
## M00009 94173/360975/170587/79250/299201/25179/114597/114096/289217/24368/361071/298942
## M00147               689938/301427/361385/293991/681418/299310/293130/100912357/297990
## M00011                                 360975/299201/114597/289217/24368/361071/298942
gseMKEGG_r_tibble <- as_tibble(gseMKEGG_r) # 17 sig @ adj-pval .05? | why spec as 1 abv?
# im confused - loot at gseMKEGG function later
enrichplot::dotplot(gseMKEGG_r,
                    split = ".sign",  # split by + vs - enrirchment score
                    showCategory = 30,
                    title = "Housing - EE vs. SH (t8 - t2) - gseMKEGG_r") +
  facet_grid(.~.sign) # split plot by seign